# train data (drop the first column, which is the row Id)
data <- read.csv('../data/train.csv')
data <- data[-1]

# Mahalanobis distance 2D: for every pair of the first 10 columns, flag the
# n_outliers points farthest (in Mahalanobis distance) from the column means
# and draw them as filled circles on a scatter plot of that pair.
n_outliers <- 100
for (i in 1:9) {
  name1 <- colnames(data)[i]
  for (j in (i + 1):10) {
    name2 <- colnames(data)[j]
    hw <- data.frame(name1 = data[, i], name2 = data[, j])
    # indices ordered by Mahalanobis distance, most extreme first
    ord <- order(mahalanobis(hw, colMeans(hw), cov(hw)), decreasing = TRUE)
    outlier <- rep(FALSE, nrow(hw))
    outlier[ord[1:n_outliers]] <- TRUE
    # pch 16 (filled circle) for outliers, pch 0 (open square) for the rest
    pch <- outlier * 16
    plot(hw, pch = pch, xlab = name1, ylab = name2)
  }
}













































# Mahalanobis distance 3D: flag the 100 points farthest from the mean of the
# first three columns and render them in red in an interactive rgl plot.
library(rgl)
name1 <- colnames(data)[1]
name2 <- colnames(data)[2]
name3 <- colnames(data)[3]
hw <- data.frame(name1 = data[, 1], name2 = data[, 2], name3 = data[, 3])
# indices ordered by Mahalanobis distance, most extreme first
ord <- order(mahalanobis(hw, colMeans(hw), cov(hw)), decreasing = TRUE)
outlier <- rep(FALSE, nrow(hw))
outlier[ord[1:100]] <- TRUE
# palette index: 1 (black) for inliers, 2 (red) for outliers
col <- outlier + 1
plot3d(hw$name1, hw$name2, hw$name3, type = 's', col = col,
       xlab = name1, ylab = name2, zlab = name3)
# autoencoders, init h2o and train/save model
#library(h2o)
#h2o.init(nthreads = -1)
#data_hf = as.h2o(data)
#features = setdiff(colnames(data_hf), "Cover_Type")
#model_nn = h2o.deeplearning(x = features,
# training_frame = data_hf,
# model_id = 'autoencoder',
# autoencoder = T,
# ignore_const_cols = F,
# seed = 57,
# hidden = c(25, 10, 25),
# epochs = 100,
# activation = 'Tanh')
#h2o.saveModel(model_nn, path='autoencoder', force = T)
#model_nn
# load Autoencoder model and identify outliers
# (the ## lines below are pasted console output kept for reference)
library(h2o)
##
## ----------------------------------------------------------------------
##
## Your next step is to start H2O:
## > h2o.init()
##
## For H2O package documentation, ask for help:
## > ??h2o
##
## After starting H2O, you can use the Web UI at http://localhost:54321
## For more information visit http://docs.h2o.ai
##
## ----------------------------------------------------------------------
##
## Attaching package: 'h2o'
## The following objects are masked from 'package:stats':
##
## cor, sd, var
## The following objects are masked from 'package:base':
##
## %*%, %in%, &&, ||, apply, as.factor, as.numeric, colnames,
## colnames<-, ifelse, is.character, is.factor, is.numeric, log,
## log10, log1p, log2, round, signif, trunc
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
h2o.init(nthreads = -1)
##
## H2O is not running yet, starting it now...
##
## Note: In case of errors look at the following log files:
## C:\Users\DjoksPC\AppData\Local\Temp\Rtmp0kL6pp/h2o_Djok_started_from_r.out
## C:\Users\DjoksPC\AppData\Local\Temp\Rtmp0kL6pp/h2o_Djok_started_from_r.err
##
##
## Starting H2O JVM and connecting: . Connection successful!
##
## R is connected to the H2O cluster:
## H2O cluster uptime: 3 seconds 40 milliseconds
## H2O cluster timezone: EET
## H2O data parsing timezone: UTC
## H2O cluster version: 3.22.1.1
## H2O cluster version age: 2 months and 5 days
## H2O cluster name: H2O_started_from_R_Djok_bti730
## H2O cluster total nodes: 1
## H2O cluster total memory: 1.76 GB
## H2O cluster total cores: 4
## H2O cluster allowed cores: 4
## H2O cluster healthy: TRUE
## H2O Connection ip: localhost
## H2O Connection port: 54321
## H2O Connection proxy: NA
## H2O Internal Security: FALSE
## H2O API Extensions: Algos, AutoML, Core V3, Core V4
## R Version: R version 3.5.2 (2018-12-20)
data_hf <- as.h2o(data)
## (upload progress bar; the original left these lines uncommented, which
## made the script unparseable)
## |
## | | 0%
## |
## |=================================================================| 100%
model_nn <- h2o.loadModel("autoencoder/autoencoder")
# per-row reconstruction MSE: rows the autoencoder reconstructs poorly are
# anomaly candidates; keep the row id and attach the Cover_Type label
anomaly <- h2o.anomaly(model_nn, data_hf) %>%
  as.data.frame() %>%
  tibble::rownames_to_column() %>%
  mutate(Cover_Type = as.vector(data$Cover_Type))
# mean reconstruction error per cover type, drawn as horizontal baselines
mean_mse <- anomaly %>%
  group_by(Cover_Type) %>%
  summarise(mean = mean(Reconstruction.MSE))
ggplot(anomaly, aes(x = as.numeric(rowname), y = Reconstruction.MSE,
                    color = as.factor(Cover_Type))) +
  geom_point(alpha = 0.3) +
  geom_hline(data = mean_mse,
             aes(yintercept = mean, color = as.factor(Cover_Type))) +
  scale_color_brewer(palette = "Set1") +
  labs(x = "instance number",
       color = "Cover_Type")

# LOF (Local Outlier Factor): for every pair of the first 10 columns, score
# each point by how much sparser its neighbourhood is than its k nearest
# neighbours', and highlight high-scoring points.
library(dbscan)
k_neighbours <- 5
for (i in 1:9) {
  name1 <- colnames(data)[i]
  for (j in (i + 1):10) {
    name2 <- colnames(data)[j]
    lof_data <- data.frame(name1 = data[, i], name2 = data[, j])
    loff <- lof(lof_data, k = k_neighbours)
    # title derived from the k actually used (the original hard-coded
    # "k=3" in the title while calling lof() with k = 5)
    plot(lof_data, pch = ".", main = paste0("LOF (k=", k_neighbours, ")"),
         xlab = name1, ylab = name2)
    # circle radius grows with the LOF score above the inlier baseline of 1
    points(lof_data, cex = (loff - 1) * 3, pch = 1, col = "red")
    # label pronounced outliers (LOF > 2) with their rounded score
    text(lof_data[loff > 2, ], labels = round(loff, 1)[loff > 2], pos = 3)
  }
}













































# Isolation Forest: for every pair of the first 10 columns, score points by
# how easily random splits isolate them; the top 5% scores are outliers.
# library(devtools)
# install_github("Zelazny7/isofor")
library(isofor)
data <- read.csv('../data/train.csv')
data <- data[-1]  # drop the Id column
for (i in 1:9) {
  name1 <- colnames(data)[i]
  for (j in (i + 1):10) {
    name2 <- colnames(data)[j]
    hw <- data.frame(name1 = data[, i], name2 = data[, j])
    # 200 trees, subsample size 32 per tree (named instead of positional
    # arguments so the magic numbers are self-describing)
    mod <- iForest(X = hw, nt = 200, phi = 32)
    p <- predict(mod, hw)
    # flag points above the 95th percentile of the anomaly score in red
    col <- ifelse(p > quantile(p, 0.95), "red", "blue")
    plot(hw, col = col, xlab = name1, ylab = name2)
  }
}













































# Projection-depth outlyingness (mrfDepth): flag multivariate outliers over
# the continuous terrain features and highlight them in the pairs plot.
library(mrfDepth)
data <- read.csv('../data/train.csv')
# data <- data[-1]  # Id is excluded by the explicit column selection below
selected_data <- data[, c('Elevation', 'Aspect', 'Slope',
                          'Horizontal_Distance_To_Hydrology',
                          'Vertical_Distance_To_Hydrology',
                          'Horizontal_Distance_To_Roadways',
                          'Hillshade_9am', 'Hillshade_Noon',
                          'Horizontal_Distance_To_Fire_Points')]
result <- outlyingness(x = selected_data)
# rows NOT flagged as regular observations by outlyingness() are outliers
outliers <- which(!result$flagX)
plot(selected_data)
# preview the outlier row indices (the original inspected the nonexistent
# column selected_data$outliers, which just printed NULL)
head(outliers)
points(selected_data[outliers, ], col = "red")
